// SPDX-License-Identifier: GPL-2.0 or GPL-3.0
// Copyright © 2018-2019 Ariadne Devos
/* sHT -- portable, adaptive, gracefully degrading SIMD */

#ifndef _sHT_VECTOR_H
#define _sHT_VECTOR_H

/** Portable vector processing (SIMD)

  Many(1) modern processors can do arithmetic on short arrays faster(2)
  and more efficiently(3), if they are somewhat assisted by using special
  registers and alignment. How large the arrays must be, varies with the
  extension. (These ‘arrays’ are called *vectors*.)

  As the vector size increases, performance goes up(4). So it would be
  a good idea to take advantage of higher sizes, when available. This
  header-only module helps you to do this, portably.

  When SIMD is not available, this falls back to scalar mode.

  (1) At least, Intel and AMD chips, which were quite popular on
    laptops in 2018
  (2) Acceleration Through Stream Computing, Ulrich Drepper (Red Hat)
    2009-09-04, slide 11, 16, 26, 29, 32
  (3) Intel® 64 and IA-32 Architectures Optimization Reference
    Manual, 13.5.1.2 Vectorization (page 13), June 2016.
    Supposedly requires you to license patents to Intel in certain
    circumstances (What? Don't buy Intel processors!).
  (4) I assume this, because otherwise, only complexity would increase.

  * How to use

  Write sHT_target_@var{size}, before the function definition, where
  @var{size} is the number of bytes in the vector. It can be 2, 4, 8,
  16, 32 or 64, depending on the host system.

  For efficiency and correctness, data to be processed with SIMD should
  be aligned to @var{sHT_vector_align}.

  The number of elements in a vector can be computed by dividing its size
  by the element size. In particular, @var{sHT_size_bytes} expands to the
  number of bytes in a @var{size_t} and @var{sHT_size_lanes} calculates the
  number of lanes (entries) in a vector from its size, checking that the
  vector is large enough. (Support other types as needed.)

  @var{sHT_size_order} is the binary logarithm of @var{sHT_size_bytes}.

  @var{sHT_size_seq} produces a vector iterating the natural numbers
  ({0, 1, 2, ...}).

  * Internals

  @var{sHT_single_vector} is defined if there is only one vector size to use,
  in which case it is the binary logarithm of the size. */

#ifndef __has_attribute
/* Older compilers lack __has_attribute; define it to expand to 0 so the
   feature tests below fall through to the GCC version checks instead. */
# define __has_attribute(a) 0
#endif

/* gcc ignores unknown attributes, make sure the 'target' attribute
   isn't silently ignored. */
#if __has_attribute(target)
/* The compiler advertises support: nothing to do. */
#elif (__GNUC__ > 4) || ((__GNUC__ == 4) && ((__GNUC_MINOR__ > 9) || ((__GNUC_MINOR__ == 9) && (__GNUC_PATCHLEVEL__ >= 4))))
/* GCC >= 4.9.4 supports 'target' even when __has_attribute is absent. */
#elif defined(__GNUC__)
# error s2 uses the attribute 'target', but that has only been introduced in GCC 4.9.4
#else
# error s2 uses the GCC attribute 'target', which is not supported by your compiler
#endif

#include <stddef.h>
#include <stdint.h>

/* These definitions assume that sizeof(T) is a correct alignment
   for any vector type T. Here is a proof:

   The C standard allows for forming an array of T. Let the array's
   length be two, and let P be a pointer to its first element.
   According to the standard (at least, I hope so), P is correctly aligned,
   and so is P + 1.

   The statements about alignment can be stated this way:
   P = 0 (mod _Alignof(T))
   P + 1 = 0 (mod _Alignof(T))

   The following is also true:

   P = (char *) P
   P + 1 = (char *) P + sizeof(T)

   Filling in the latter values in the former:

   (char *) P = 0 (mod _Alignof(T))
   (char *) P + sizeof(T) = 0 (mod _Alignof(T))

   Subtracting the first from the second:

   sizeof(T) = 0 (mod _Alignof(T))

   This is equivalent to (1):

   _Alignof(T) | sizeof(T) (divides)

   Then, proof that if a pointer Q is sizeof(T) aligned,
   it also is _Alignof(T) aligned.

   Q = 0 (mod sizeof(T))
   ==>
   sizeof(T) | Q
   ==> (introduce (1))
   _Alignof(T) | sizeof(T)
   sizeof(T) | Q
   ==> cut (transitivity of divisibility)
   _Alignof(T) | Q
   ==>
   Q = 0 (mod _Alignof(T))
   Q.E.D */

#if defined(__x86_64__)
/* x86-64 requires SSE2 to be supported,
   so don't generate a scalar fallback. */
# define sHT_target_16 __attribute__((target ("sse2")))
# define sHT_target_32 __attribute__((target ("avx2")))
# define sHT_target_64 __attribute__((target ("avx512f")))
# define sHT_vector_align 64
#elif defined(__i386__) || defined(__x86__)
/* GCC predefines __i386__ for 32-bit x86 (not __x86__); keep the old
   spelling as well, in case some other compiler uses it. */
# define sHT_target_4
/* AMD has something called 3DNow, but that's for floating-point,
   which s2 doesn't do. */
# define sHT_target_8 __attribute__((target ("mmx")))
# define sHT_target_16 __attribute__((target ("sse2")))
# define sHT_target_32 __attribute__((target ("avx2")))
# define sHT_target_64 __attribute__((target ("avx512f")))
# define sHT_vector_align 64
#elif defined(__aarch64__)
/* Aarch64 requires NEON to be supported. */
# define sHT_single_vector 4
# define sHT_target_16
# define sHT_vector_align 16
#elif defined(__arm__)
# define sHT_target_4
# define sHT_target_16 __attribute__((target ("fpu=neon")))
# define sHT_vector_align 16
#elif defined(__ia64__)
/* IA 64 has 32 static general purpose registers!
   (and 96 with a register window (?)). Do some unrolling
   instead of using SIMD (which doesn't seem to exist). */
# define sHT_single_vector 4
# define sHT_target_16
# define sHT_vector_align 8
/* Fallback to scalar processing. Guess the width of the
   general-purpose registers. */
#elif defined(UINT16_MAX) && !defined(UINT32_MAX)
# define sHT_single_vector 1
# define sHT_target_2
# define sHT_vector_align 2
#elif defined(UINT32_MAX) && !defined(UINT64_MAX)
# define sHT_single_vector 2
# define sHT_target_4
# define sHT_vector_align 4
#elif defined(UINT64_MAX) && !defined(UINT128_MAX)
# define sHT_single_vector 3
# define sHT_target_8
# define sHT_vector_align 8
#elif defined(UINT128_MAX) && !defined(UINT256_MAX)
# define sHT_single_vector 4
# define sHT_target_16
/* A 16-byte (2**4) vector needs 16-byte alignment, matching every
   other branch; the previous value (2) was a typo. */
# define sHT_vector_align 16
#else
# error 256-bit systems are not yet supported
#endif

/* Derive the width of size_t from SIZE_MAX:
   sHT_size_bytes == sizeof(size_t), sHT_size_order == log2(sHT_size_bytes). */
#if UINT16_MAX == SIZE_MAX
# define sHT_size_order 1
# define sHT_size_bytes 2
#elif UINT32_MAX == SIZE_MAX
# define sHT_size_order 2
# define sHT_size_bytes 4
#elif UINT64_MAX == SIZE_MAX
# define sHT_size_order 3
# define sHT_size_bytes 8
#elif UINT128_MAX == SIZE_MAX
/* NOTE(review): UINT128_MAX is not a standard <stdint.h> macro --
   this branch is speculative and only fires if the platform defines it. */
# define sHT_size_order 4
# define sHT_size_bytes 16
#else
# error 256-bit systems are not yet supported
#endif

#ifndef sHT_single_vector
/** Detect SIMD capabilities

  This sets the SIMD capability flags, that will depend
  upon the advertised microarchitecture and compile-time
  architecture.

  (The flags might speculatively be incorrect, but that should only
  cause a speculative invalid instruction trap, which should be innocent.) */
void
sHT_detect_simd(void);

/** Choose a vectorized implementation

  Consider a non-empty sequence of vector sizes N, where
  sHT_target_N is defined and 1 << @var{elem_order} divides N.

  Find an index into the sequence, optimised for performance.
  (Typically, the largest possible vector size.)

  Precondition: 2**@var{elem_order} divides some sHT_target_N for some
  N (a power of two). (True for all standard integer types, whose size
  is a power of two.)

  The index depends only upon the SIMD capability flags and
  compile-time host architecture.

  @var{elem_order} must be a compile-time constant.
  (E.g. @var{sHT_size_order}).

  Precondition: SIMD capabilities have been detected
  (e.g. with @var{sHT_detect_simd}). */
size_t
sHT_select_simd(int elem_order);

#else
/* Only one vector size exists on this target, so there is
   nothing to detect at runtime. */
static inline void
sHT_detect_simd(void)
{
}

/* The single vector holds 1 << sHT_single_vector bytes, so the only
   valid index is 0.  The unnamed bitfield gets a negative width -- a
   compile-time error -- exactly when an element of 1 << elem_order
   bytes cannot fit in that vector.  (The previous condition,
   elem_order <= sHT_single_vector, was inverted: it rejected every
   valid element size and accepted the invalid ones; compare the
   analogous check in sHT_size_lanes, which negates the *failure*
   condition.) */
#define sHT_select_simd(elem_order) \
	((void) sizeof(struct { int: -((elem_order) > sHT_single_vector); }), (size_t) 0)
#endif

/* gcc ignores unknown attributes, make sure the 'vector_size' attribute
   isn't silently ignored. */
#if __has_attribute(vector_size)
/* The compiler advertises support: nothing to do. */
#elif (__GNUC__ > 4) || ((__GNUC__ == 4) && ((__GNUC_MINOR__ > 0) || ((__GNUC_MINOR__ == 0) && (__GNUC_PATCHLEVEL__ >= 4))))
/* GCC >= 4.0.4 supports 'vector_size' even when __has_attribute is absent. */
#elif defined(__GNUC__)
# error s2 uses the attribute 'vector_size', but that has only been introduced in GCC 4.0.4
#else
# error s2 uses the GCC attribute 'vector_size', which is not supported by your compiler
#endif

/** A vector type of size @var{bytes}, containing @var{size_t}

  NOTE(review): GCC's 'vector_size' attribute requires @var{bytes} to be
  a multiple of sizeof(size_t) -- callers should only pass sizes from
  the sHT_target_N set. */
#define sHT_size_vec(bytes) \
  __attribute__((vector_size(bytes))) size_t

/** Calculate the number of lanes in a vector of @var{size_t}

  @var{bytes}: the size of the vector, in bytes

  Returns @var{bytes} / sizeof(size_t), i.e. the vector size divided
  by the element size (see the module header).

  This fails to compile if the vector cannot be filled with
  @var{size_t}. */
#define sHT_size_lanes(bytes) \
  /* If @var{sHT_size_bytes} doesn't divide @var{bytes}, the struct
     would have a negative-size bitfield, which isn't allowed. It is said
     that this trick has been performed in the Linux kernel */ \
  ((void) sizeof(struct { int: -(int) ((bytes) % sHT_size_bytes); }), \
  /* Lane count = vector size / element size.  The previous expression,
     sHT_size_bytes/(bytes), was inverted and yielded 0 for every
     vector larger than size_t. */ \
  (bytes)/sHT_size_bytes)

/** Return a vector of @var{size_t} enumerating the natural numbers

  @var{bytes}: the size of the vector, in bytes

  This returns { 0, 1, 2, ... }.

  The 32 initializers cover the widest case this header supports
  (a 64-byte vector of 2-byte size_t); shorter vectors simply drop
  the excess entries, which is what triggers the warnings below. */
#define sHT_size_seq(bytes) \
  /* This produces a lot of warnings, because
     not all entries are used.
     TODO: patch a -Wno-vector-excess into GCC */ \
  ((sHT_size_vec(bytes)) \
   { 0, 1, 2, 3, 4, 5, 6, 7, \
     8, 9, 10, 11, 12, 13, 14, 15, \
     16, 17, 18, 19, 20, 21, 22, 23, \
     24, 25, 26, 27, 28, 29, 30, 31, })

#endif
